suppressMessages(
suppressWarnings(
c(library(data.table),
library(tidyverse),
library(rtracklayer),
library(DESeq2),
library(Biostrings),
library(Rsubread),
library(pheatmap),
library(gridExtra),
library(ggrepel),
library(CLIPanalyze),
library(ggplot2),
library(RColorBrewer),
library(mixOmics),
library(plotly))
)
)
## [1] "data.table" "stats" "graphics"
## [4] "grDevices" "utils" "datasets"
## [7] "methods" "base" "forcats"
## [10] "stringr" "dplyr" "purrr"
## [13] "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "data.table"
## [19] "stats" "graphics" "grDevices"
## [22] "utils" "datasets" "methods"
## [25] "base" "rtracklayer" "GenomicRanges"
## [28] "GenomeInfoDb" "IRanges" "S4Vectors"
## [31] "BiocGenerics" "parallel" "stats4"
## [34] "forcats" "stringr" "dplyr"
## [37] "purrr" "readr" "tidyr"
## [40] "tibble" "ggplot2" "tidyverse"
## [43] "data.table" "stats" "graphics"
## [46] "grDevices" "utils" "datasets"
## [49] "methods" "base" "DESeq2"
## [52] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [55] "matrixStats" "Biobase" "rtracklayer"
## [58] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [61] "S4Vectors" "BiocGenerics" "parallel"
## [64] "stats4" "forcats" "stringr"
## [67] "dplyr" "purrr" "readr"
## [70] "tidyr" "tibble" "ggplot2"
## [73] "tidyverse" "data.table" "stats"
## [76] "graphics" "grDevices" "utils"
## [79] "datasets" "methods" "base"
## [82] "Biostrings" "XVector" "DESeq2"
## [85] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [88] "matrixStats" "Biobase" "rtracklayer"
## [91] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [94] "S4Vectors" "BiocGenerics" "parallel"
## [97] "stats4" "forcats" "stringr"
## [100] "dplyr" "purrr" "readr"
## [103] "tidyr" "tibble" "ggplot2"
## [106] "tidyverse" "data.table" "stats"
## [109] "graphics" "grDevices" "utils"
## [112] "datasets" "methods" "base"
## [115] "Rsubread" "Biostrings" "XVector"
## [118] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [121] "BiocParallel" "matrixStats" "Biobase"
## [124] "rtracklayer" "GenomicRanges" "GenomeInfoDb"
## [127] "IRanges" "S4Vectors" "BiocGenerics"
## [130] "parallel" "stats4" "forcats"
## [133] "stringr" "dplyr" "purrr"
## [136] "readr" "tidyr" "tibble"
## [139] "ggplot2" "tidyverse" "data.table"
## [142] "stats" "graphics" "grDevices"
## [145] "utils" "datasets" "methods"
## [148] "base" "pheatmap" "Rsubread"
## [151] "Biostrings" "XVector" "DESeq2"
## [154] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [157] "matrixStats" "Biobase" "rtracklayer"
## [160] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [163] "S4Vectors" "BiocGenerics" "parallel"
## [166] "stats4" "forcats" "stringr"
## [169] "dplyr" "purrr" "readr"
## [172] "tidyr" "tibble" "ggplot2"
## [175] "tidyverse" "data.table" "stats"
## [178] "graphics" "grDevices" "utils"
## [181] "datasets" "methods" "base"
## [184] "gridExtra" "pheatmap" "Rsubread"
## [187] "Biostrings" "XVector" "DESeq2"
## [190] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [193] "matrixStats" "Biobase" "rtracklayer"
## [196] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [199] "S4Vectors" "BiocGenerics" "parallel"
## [202] "stats4" "forcats" "stringr"
## [205] "dplyr" "purrr" "readr"
## [208] "tidyr" "tibble" "ggplot2"
## [211] "tidyverse" "data.table" "stats"
## [214] "graphics" "grDevices" "utils"
## [217] "datasets" "methods" "base"
## [220] "ggrepel" "gridExtra" "pheatmap"
## [223] "Rsubread" "Biostrings" "XVector"
## [226] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [229] "BiocParallel" "matrixStats" "Biobase"
## [232] "rtracklayer" "GenomicRanges" "GenomeInfoDb"
## [235] "IRanges" "S4Vectors" "BiocGenerics"
## [238] "parallel" "stats4" "forcats"
## [241] "stringr" "dplyr" "purrr"
## [244] "readr" "tidyr" "tibble"
## [247] "ggplot2" "tidyverse" "data.table"
## [250] "stats" "graphics" "grDevices"
## [253] "utils" "datasets" "methods"
## [256] "base" "CLIPanalyze" "GenomicAlignments"
## [259] "Rsamtools" "GenomicFeatures" "AnnotationDbi"
## [262] "plyr" "ggrepel" "gridExtra"
## [265] "pheatmap" "Rsubread" "Biostrings"
## [268] "XVector" "DESeq2" "SummarizedExperiment"
## [271] "DelayedArray" "BiocParallel" "matrixStats"
## [274] "Biobase" "rtracklayer" "GenomicRanges"
## [277] "GenomeInfoDb" "IRanges" "S4Vectors"
## [280] "BiocGenerics" "parallel" "stats4"
## [283] "forcats" "stringr" "dplyr"
## [286] "purrr" "readr" "tidyr"
## [289] "tibble" "ggplot2" "tidyverse"
## [292] "data.table" "stats" "graphics"
## [295] "grDevices" "utils" "datasets"
## [298] "methods" "base" "CLIPanalyze"
## [301] "GenomicAlignments" "Rsamtools" "GenomicFeatures"
## [304] "AnnotationDbi" "plyr" "ggrepel"
## [307] "gridExtra" "pheatmap" "Rsubread"
## [310] "Biostrings" "XVector" "DESeq2"
## [313] "SummarizedExperiment" "DelayedArray" "BiocParallel"
## [316] "matrixStats" "Biobase" "rtracklayer"
## [319] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [322] "S4Vectors" "BiocGenerics" "parallel"
## [325] "stats4" "forcats" "stringr"
## [328] "dplyr" "purrr" "readr"
## [331] "tidyr" "tibble" "ggplot2"
## [334] "tidyverse" "data.table" "stats"
## [337] "graphics" "grDevices" "utils"
## [340] "datasets" "methods" "base"
## [343] "RColorBrewer" "CLIPanalyze" "GenomicAlignments"
## [346] "Rsamtools" "GenomicFeatures" "AnnotationDbi"
## [349] "plyr" "ggrepel" "gridExtra"
## [352] "pheatmap" "Rsubread" "Biostrings"
## [355] "XVector" "DESeq2" "SummarizedExperiment"
## [358] "DelayedArray" "BiocParallel" "matrixStats"
## [361] "Biobase" "rtracklayer" "GenomicRanges"
## [364] "GenomeInfoDb" "IRanges" "S4Vectors"
## [367] "BiocGenerics" "parallel" "stats4"
## [370] "forcats" "stringr" "dplyr"
## [373] "purrr" "readr" "tidyr"
## [376] "tibble" "ggplot2" "tidyverse"
## [379] "data.table" "stats" "graphics"
## [382] "grDevices" "utils" "datasets"
## [385] "methods" "base" "mixOmics"
## [388] "lattice" "MASS" "RColorBrewer"
## [391] "CLIPanalyze" "GenomicAlignments" "Rsamtools"
## [394] "GenomicFeatures" "AnnotationDbi" "plyr"
## [397] "ggrepel" "gridExtra" "pheatmap"
## [400] "Rsubread" "Biostrings" "XVector"
## [403] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [406] "BiocParallel" "matrixStats" "Biobase"
## [409] "rtracklayer" "GenomicRanges" "GenomeInfoDb"
## [412] "IRanges" "S4Vectors" "BiocGenerics"
## [415] "parallel" "stats4" "forcats"
## [418] "stringr" "dplyr" "purrr"
## [421] "readr" "tidyr" "tibble"
## [424] "ggplot2" "tidyverse" "data.table"
## [427] "stats" "graphics" "grDevices"
## [430] "utils" "datasets" "methods"
## [433] "base" "plotly" "mixOmics"
## [436] "lattice" "MASS" "RColorBrewer"
## [439] "CLIPanalyze" "GenomicAlignments" "Rsamtools"
## [442] "GenomicFeatures" "AnnotationDbi" "plyr"
## [445] "ggrepel" "gridExtra" "pheatmap"
## [448] "Rsubread" "Biostrings" "XVector"
## [451] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [454] "BiocParallel" "matrixStats" "Biobase"
## [457] "rtracklayer" "GenomicRanges" "GenomeInfoDb"
## [460] "IRanges" "S4Vectors" "BiocGenerics"
## [463] "parallel" "stats4" "forcats"
## [466] "stringr" "dplyr" "purrr"
## [469] "readr" "tidyr" "tibble"
## [472] "ggplot2" "tidyverse" "data.table"
## [475] "stats" "graphics" "grDevices"
## [478] "utils" "datasets" "methods"
## [481] "base"
Load the rds file from all samples CLIP analysis and use MA plot to visuaize that sequence depth are balanced between CLIP and IC libraries, and the peaks that were called.
peak.data<- readRDS("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/joint_analysis/peakdata.2020-03-08.rds")
plotMA(peak.data$gene.counts.nopeaks,
main = "MA plot for all HEAP vs IC\n in genes outside peaks")
plotMA(peak.data$res.deseq,
main = "MA plot for all HEAP vs IC in peaks,\none-sided test")
Prepare the peak, assign scores to KRas peaks by adjusted p-value.
peaks.all <- peak.data$peaks
peaks.all <- subset(peaks.all, width > 20)
peaks.all <- subset(peaks.all, log2FC > 0)
peaks.all <- keepStandardChromosomes(peaks.all)
score(peaks.all) <- -log10(peaks.all$padj)
length(peaks.all)
## [1] 6150
summary(width(peaks.all))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.0 51.0 55.0 58.9 62.0 167.0
count.threshold <- 10
norm.counts <- counts(peak.data$peak.counts, normalized = TRUE)
norm.counts <- norm.counts[names(peaks.all), ]
colnames(norm.counts)[1:6] <- c(paste0("HF", 4:6), paste0("HFK", 4:6))
colnames(norm.counts)[7:12] <- c(paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
selected.peaks <- (rowMeans(norm.counts[, paste0("HF", 4:6)]) > count.threshold) |
(rowMeans(norm.counts[, paste0("HFK", 4:6)]) > count.threshold)
peaks.all <- peaks.all[selected.peaks, ]
length(peaks.all)
## [1] 3861
summary(width(peaks.all))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 51.00 55.00 57.66 61.00 163.00
peaks.all.overlaps <- peak.data[[2]]
peaks.all.overlaps <-
peaks.all.overlaps[name %in% names(peaks.all)]
peaks.all.lncRNA <-
peaks.all.overlaps[gene_type %in% c("lincRNA", "antisense",
"processed_transcript")]
peaks.all.lncRNA.arbitrary <- peaks.all.lncRNA[, .(name, gene_name)]
peaks.all.lncRNA.arbitrary <-
unique(peaks.all.lncRNA.arbitrary, by = "name")
peaks.all$lncRNA <- as.character(NA)
peaks.all[peaks.all.lncRNA.arbitrary$name, ]$lncRNA <-
peaks.all.lncRNA.arbitrary$gene_name
peaks.all$annot <- ifelse(is.na(peaks.all$lncRNA),
peaks.all$annot,
"lncRNA")
padj.threshold <- 0.05
filter.peaks <- subset(peaks.all, padj < padj.threshold)
score(filter.peaks) <- -log10(filter.peaks$padj)
length(filter.peaks)
## [1] 3300
summary(width(filter.peaks))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 51.00 54.00 57.29 60.00 163.00
# HF peak filtering
peak.data.hf <-
peak.data$peak.counts
colnames(peak.data.hf) <- c(paste0("HF", 4:6), paste0("HFK", 4:6), paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
peak.data.hf <-
peak.data.hf[names(peaks.all),
c("HF4", "HF5", "HF6", "HF-IC4", "HF-IC5", "HF-IC6")]
design(peak.data.hf) <- ~ condition
peak.data.hf <- DESeq(peak.data.hf)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(peak.data.hf,
main = "MA plot for HF over input\nin all peaks",
xlab = "Mean of normalized counts")
res.hf <- results(peak.data.hf)
peaks.all$hf.padj <- as.numeric(NA)
peaks.all$hf.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hf), ]$hf.padj <-
res.hf$padj
peaks.all[rownames(res.hf), ]$hf.log2FC <-
res.hf$log2FoldChange
peaks.hf <- subset(peaks.all,
padj < padj.threshold & log2FC > 0 &
hf.padj < padj.threshold &
hf.log2FC > 0)
score(peaks.hf) <- -log10(peaks.hf$hf.padj)
length(peaks.hf)
## [1] 2239
length(subsetByOverlaps(peaks.hf, filter.peaks))
## [1] 2239
summary(width(peaks.hf))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 51.00 54.00 56.64 59.00 163.00
peak.data.hfk <-
peak.data$peak.counts
colnames(peak.data.hfk) <- c(paste0("HF", 4:6), paste0("HFK", 4:6), paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
peak.data.hfk <-
peak.data.hfk[names(peaks.all),
c("HFK4", "HFK5", "HFK6", "HFK-IC4", "HFK-IC5", "HFK-IC6")]
design(peak.data.hfk) <- ~ condition
peak.data.hfk <- DESeq(peak.data.hfk)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(peak.data.hfk,
main = "MA plot for HFK over input\nin all peaks",
xlab = "Mean of normalized counts")
res.hfk <- results(peak.data.hfk)
peaks.all$hfk.padj <- as.numeric(NA)
peaks.all$hfk.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hfk), ]$hfk.padj <-
res.hfk$padj
peaks.all[rownames(res.hfk), ]$hfk.log2FC <-
res.hfk$log2FoldChange
peaks.hfk <- subset(peaks.all,
padj < padj.threshold & log2FC > 0 &
hfk.padj < padj.threshold &
hfk.log2FC > 0)
score(peaks.hfk) <- -log10(peaks.hfk$hfk.padj)
length(peaks.hfk)
## [1] 2871
length(subsetByOverlaps(peaks.hfk, filter.peaks))
## [1] 2871
summary(width(peaks.hfk))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 51.00 54.00 57.02 60.00 163.00
colnames(peak.data$peak.counts)[1:6] <- c(paste0("HF", 4:6), paste0("HFK", 4:6))
dds.hfk.hf <-
peak.data$peak.counts[names(filter.peaks),
c(paste0("HF", 4:6), paste0("HFK", 4:6))]
dds.hfk.hf$sampletype <-
factor(c(rep("HF", 3), rep("HFK", 3)), levels = c("HF", "HFK"))
design(dds.hfk.hf) <- ~ sampletype
dds.hfk.hf <- DESeq(dds.hfk.hf)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.hfk.hf <- results(dds.hfk.hf, contrast = c("sampletype", "HFK", "HF"))
plotMA(res.hfk.hf,
main = "MA plot for HFK over HF\nin all peaks (significant over input)",
xlab = "Mean of normalized counts")
summary(res.hfk.hf)
##
## out of 3300 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 1009, 31%
## LFC < 0 (down) : 902, 27%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
dds_transform <- varianceStabilizingTransformation(dds.hfk.hf)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
plotPCA(dds_transform, intgroup = "sampletype", ntop = 500) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-50,60) + ylim(-40,50)
peaks.all$hfk.hf.padj <- as.numeric(NA)
peaks.all$hfk.hf.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hfk.hf), ]$hfk.hf.padj <-
res.hfk.hf$padj
peaks.all[rownames(res.hfk.hf), ]$hfk.hf.log2FC <-
res.hfk.hf$log2FoldChange
Save the datasets
saveRDS(peaks.all, "Datafiles/peaks-all-12232019.rds")
filter.peaks <- subset(peaks.all, padj < padj.threshold)
saveRDS(filter.peaks, "Datafiles/peaks-filtered-12232019.rds")
Map seed to peaks
mirna.info.family <- readRDS("mirna-info-family-seedmatches.rds")
assignMirToPeaks <- function(miRNA = mirs,
peaks = es.peaks.utr3,
database = mirna.info.family){
require(BSgenome)
require(CLIPanalyze)
require(Biostrings)
bsgenome <- load.bsgenome("mm10")
peaks.seq <- get.seqs(bsgenome, peaks)
peaks$seed.8mer <- as.character(NA)
peaks$seed.7m8 <- as.character(NA)
peaks$seed.7A1 <- as.character(NA)
peaks$seed.6mer <- as.character(NA)
for (i in 1:length(miRNA)){
mir <- miRNA[i]
#Prepare seed matches
mir.6m <- as.character(database[database$miR.family %in% mir, ]$seedmatch.6)
mir.7m8 <- as.character(database[database$miR.family %in% mir, ]$seedmatch.m8)
mir.7mA <- as.character(database[database$miR.family %in% mir, ]$seedmatch.A1)
mir.8m <- as.character(database[database$miR.family %in% mir, ]$seedmatch.8)
#Filter peaks with 6mer matches
match <- GRanges(vmatchPattern(mir.6m, peaks.seq))
#Go back to peaks and map the seed match and extend 1 nt on both directions
match.extend <- peaks[seqnames(match),]
match.strand <- as.logical(strand(match.extend) == "+")
start(match.extend[match.strand,]) <- start(match.extend[match.strand,]) + start(match[match.strand,]) - 2
match.extend[match.strand,] <- resize(match.extend[match.strand,], fix = "start", width = 8)
end(match.extend[!match.strand,]) <- end(match.extend[!match.strand,]) - start(match[!match.strand,]) + 2
match.extend[!match.strand,] <- resize(match.extend[!match.strand,], fix = "start", width = 8)
#Assign seed types to these matches
match.extend.seq <- get.seqs(bsgenome, match.extend)
match.extend.seq.df <- data.frame(Peaks = names(match.extend.seq),
Sequence = as.character(match.extend.seq))
for (j in 1:nrow(match.extend.seq.df)){
peak.name <- as.character(match.extend.seq.df[j, "Peaks"])
seq <- as.character(match.extend.seq.df[j, "Sequence"])
if (grepl(mir.8m, seq)){
peaks[peak.name, ]$seed.8mer <- ifelse(is.na(peaks[peak.name, ]$seed.8mer),
mir,
paste(peaks[peak.name, ]$seed.8mer, mir, sep = ", "))
} else if (grepl(mir.7m8, seq)){
peaks[peak.name, ]$seed.7m8 <- ifelse(is.na(peaks[peak.name, ]$seed.7m8),
mir,
paste(peaks[peak.name, ]$seed.7m8, mir, sep = ", "))
} else if (grepl(mir.7mA, seq)){
peaks[peak.name, ]$seed.7A1 <- ifelse(is.na(peaks[peak.name, ]$seed.7A1),
mir,
paste(peaks[peak.name, ]$seed.7A1, mir, sep = ", "))
} else {
peaks[peak.name, ]$seed.6mer <- ifelse(is.na(peaks[peak.name, ]$seed.6mer),
mir,
paste(peaks[peak.name, ]$seed.6mer, mir, sep = ", "))
}
}
}
return(peaks)
}
miRNAs with mean counts larger than 200 are selected and mapped to peaks
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
mirs <- subset(mirna.family.DGE, baseMean > 200)
mirs <- mirs$miR.family
#peaks.mirs <- assignMirToPeaks(miRNA = mirs,
# peaks = filter.peaks,
# database = mirna.info.family)
#saveRDS(peaks.mirs, "Datafiles/peaks-mirs-200-12232019.rds")
peaks.mirs <- readRDS("Datafiles/peaks-mirs-200-12232019.rds")
targetofmiR <- function(peaks.mir = brain.peaks.mirs,
miRNA = "",
sitetype = "8mer"){
peaks.mir.sub <- as.data.frame(peaks.mir[,c("log2FC", "padj",
"seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")])
peaks.seedmatch <- lapply(c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer"),
function(seed){
map <- peaks.mir.sub[grepl(miRNA, peaks.mir.sub[,seed]),]
map <- rownames(map)
map
})
names(peaks.seedmatch) <- c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")
if (sitetype == "8mer"){
maps <- peaks.seedmatch[[1]]
} else if (sitetype == "7mer_above"){
maps <- unique(unlist(peaks.seedmatch[1:3]))
} else if (sitetype == "7mer"){
maps <- unique(unlist(peaks.seedmatch[2:3]))
} else if (sitetype == "6mer"){
maps <- peaks.seedmatch[[4]]
} else {
print("Please input site type as: 8mer, 7mer_above, 7mer or 6mer")
}
return(peaks.mir[maps])
}
mirs.peaks <- lapply(mirs,
function(mir){
targetofmiR(miRNA = mir,
peaks.mir = peaks.mirs,
sitetype = "7mer_above")
})
names(mirs.peaks) <- mirs
saveRDS(mirs.peaks, "Datafiles/miRNA-peaks-list-12232019.rds")
lens <- as.data.frame(sapply(mirs.peaks, function(x) length(x)))
mirs.peaks.log2FC.median <- sapply(mirs.peaks,
function(list){
log2FCs <- list$hfk.hf.log2FC
return(median(log2FCs))
})
mirs.peaks.log2FC.mean <- sapply(mirs.peaks,
function(list){
log2FCs <- list$hfk.hf.log2FC
return(mean(log2FCs))
})
mirs.targets.log2FC <- cbind(as.data.frame(mirs.peaks.log2FC.median),
as.data.frame(mirs.peaks.log2FC.mean),
lens)
colnames(mirs.targets.log2FC) <- c("targets_log2FC_median","targets_log2FC_mean", "N")
mirs.targets.log2FC <- merge(mirs.targets.log2FC, mirna.family.DGE[,c("log2FoldChange", "padj")], by = 0)
colnames(mirs.targets.log2FC)[1] <- "miR.family"
Show any miRNA that has more than 20 peaks matched
df <- subset(mirs.targets.log2FC, N > 20)
p <- ggplot(df, aes(x = log2FoldChange, y = targets_log2FC_median, label = miR.family, size = N)) +
geom_point(colour = "#1C75BB", alpha = 0.6) +
#scale_color_manual(fill = "yellow") +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("miRNA expression Fold change log2 (HFK / HF)") +
ylab("Median of peak signal changes log2 (HFK / HF)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p <- ggplotly(p)
p
p2 <- ggplot(df, aes(x = log2FoldChange, y = targets_log2FC_mean, label = miR.family, size = N)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
#scale_color_manual(fill = "yellow") +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("miRNA expression Fold change log2 (HFK / HF)") +
ylab("Mean of peak signal changes log2 (HFK / HF)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p2 <- ggplotly(p2)
p2
If we only look at the top 40 highly expressed miRNAs
mirna.family.DGE <- mirna.family.DGE[order(-mirna.family.DGE$baseMean),]
mirs.40 <- rownames(mirna.family.DGE)[1:40]
df.40 <- df[df$miR.family %in% mirs.40,]
p3 <- ggplot(df.40, aes(x = log2FoldChange, y = targets_log2FC_median, label = miR.family, size = N)) +
geom_point(colour = "#1C75BB", alpha = 0.6) +
#scale_color_manual(fill = "yellow") +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("Top 40 miRNA expression Fold change log2 (HFK / HF)") +
ylab("Median of peak signal changes log2 (HFK / HF)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p3 <- ggplotly(p3)
p3
p4 <- ggplot(df.40, aes(x = log2FoldChange, y = targets_log2FC_mean, label = miR.family, size = N)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
#scale_color_manual(fill = "yellow") +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("Top 40 miRNA expression Fold change log2 (HFK / HF)") +
ylab("Mean of peak signal changes log2 (HFK / HF)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p4 <- ggplotly(p4)
p4
Here are the top 10 miRNAs with the most peaks associated.
color.vec <- brewer.pal(name = "Spectral", n = 11)
df <- as.data.frame(filter.peaks)
df$hfk.hf.logP <- -log10(df$hfk.hf.padj)
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
mirs.peaks.names <- lapply(mirs.peaks, function(x) names(x))
for (i in 1:10){
p <- ggplot() +
geom_point(data = df, aes(x = hfk.hf.log2FC, y = hfk.hf.logP), size = 1, alpha = 0.5, color = "grey80") +
geom_point(data = df[mirs.peaks.names[[mirna[i]]],], aes(x = hfk.hf.log2FC, y = hfk.hf.logP), size = 1, alpha = 0.7, color = "#7570B3") +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", size = 0.3) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", size = 0.3) +
xlab("Fold change log2 (HFK / HF)") +
ylab("-log10(FDR)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) +
#guides(fill = guide_legend(title = "miRNA family")) +
ggtitle(sprintf("%s targets", mirna[i ]))
print(p)
}
I just want a table of all top 4o miRNAs and the number of peaks that are associated with them with LFC (HFK/HF) > 0 and < 0.
peak.number <- as.data.frame(table(mirs.peaks[[1]]$hfk.hf.log2FC > 0))
peak.number <- t(peak.number[,-1])
colnames(peak.number) <- c("LFC(HFK/HF) < 0", "LFC(HFK/HF) > 0")
for (i in 2: 40) {
new.peak <- as.data.frame(table(mirs.peaks[[i]]$hfk.hf.log2FC > 0))
new.peak <- t(new.peak[,-1])
peak.number <- rbind(peak.number, new.peak)
}
rownames(peak.number) <- names(mirs.peaks)[1:40]
peak.number <- rbind(peak.number, c(sum(df$hfk.hf.log2FC > 0), sum(df$hfk.hf.log2FC < 0)))
rownames(peak.number)[dim(peak.number)[1]] <- "Total"
peak.number
## LFC(HFK/HF) < 0 LFC(HFK/HF) > 0
## let-7-5p/miR-98-5p 172 184
## miR-194-5p 92 135
## miR-200-3p/429-3p 88 113
## miR-15-5p/16-5p/195-5p/322-5p/497-5p 104 90
## miR-29-3p 75 82
## miR-24-3p 83 62
## miR-22-3p 70 59
## miR-103-3p/107-3p 71 53
## miR-27-3p 60 59
## miR-26-5p 52 61
## miR-17-5p/20-5p/93-5p/106-5p 49 59
## miR-141-3p 56 49
## miR-196-5p 45 59
## miR-23-3p 49 40
## miR-30-5p/384-5p 57 26
## miR-484 42 38
## miR-19-3p 41 33
## miR-96-5p 28 39
## miR-7-5p 23 41
## miR-182-5p 26 35
## miR-3058-5p 28 32
## miR-210-3p 29 29
## miR-147-3p 35 23
## miR-148-3p/152-3p 37 21
## miR-205-5p 32 22
## miR-25-3p/32-5p/92-3p/363-3p/367-3p 30 19
## miR-130-3p/301-3p 24 21
## miR-30a-3p/30d-3p/30e-3p 18 26
## miR-34-5p/449-5p 24 20
## miR-185-5p 18 25
## miR-194-2-3p/6926-5p/7055-5p 14 29
## miR-378-3p 27 16
## miR-192-5p/215-5p 28 14
## miR-423-5p 18 22
## miR-671-5p 16 23
## miR-28-5p/708-5p 15 20
## miR-141-5p/6769b-3p 22 12
## miR-181-5p 16 18
## miR-139-5p 9 22
## miR-21ac-5p 19 12
## Total 1732 1568